In [1]:
import time
import datetime
import os
import csv
import logging
from dotenv import load_dotenv

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec


def create_logfile():
    """
    :returns logging
    """
    date = datetime.datetime.today().strftime('%d-%b-%y_%H:%M:%S')
    logfile = f"log/{date}.log"
    logging.basicConfig(filename=logfile, filemode='w', level=logging.INFO, \
                        format='%(asctime)s - %(levelname)s - %(message)s', \
                        datefmt='%d-%b-%y %H:%M:%S', force=True)
    logging.info(f"Log file {logfile} created.")
    return logging


def create_file(file, logging):
    """
    :param file
    :param logging

    :returns nothing
    """

    # if the daily file already exists, make sure to delete it to not clutter disk space
    logging.info("Checking if current daily csv already exists...")
    if os.path.exists(file):
        os.remove(file)
        logging.info(f"     ...{file} deleted")
    else:
        logging.info(f"     ...{file} not found")

    # create file and add header
    logging.info("Creating daily CSV...")
    header = ['date_time', 'search_keyword', 'search_count', 'job_id', 'job_title', \
              'company' 'location', 'remote', 'update_time', 'applicants', 'job_pay', \
              'job_time', 'job_position', 'company_size', 'company_industry', \
              'job_details']
    with open(file, 'w') as f:
        w = csv.writer(f)
        w.writerow(header)
        logging.info(f"     ...{file} created")


def login(logging):
    """
    :param logging

    :returns wd - Chromium webdriver object from the selenium library
    """

    url_login = 'https:///www.linkedin.com/'
    load_dotenv()

    # From .env file
    LINKEDIN_USERNAME = os.getenv('LINKEDIN_USERNAME')
    LINKEDIN_PASSWORD = os.getenv('LINKEDIN_PASSWORD')

    # create headless instance of Chrome
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--window-size=1920,1080')

    # Actually log in to LinkedIn
    logging.info(f"Looging into LinkedIn as {LINKEDIN_USERNAME}...")
    wd = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)
    wd.get(url_login)
    wd.find_element_by_id('session_key').send_keys(LINKEDIN_USERNAME)
    wd.find_element_by_id('session_password').send_keys(LINKEDIN_PASSWORD)
    wd.find_element_by_xpath("//button[@class='sign-in-form__submit-button']").click()

    # not sure how often this happens, but sometimes there's a popup to confirm login info
    try:
        wd.find_element_by_xpath("//buton[@class='primary-action-new']").click()
    except:
        pass
    logging.info("     ...logged in")
    return wd


def page_search(wd, search_location, search_keyword, search_remote, search_posted, search_page, search_count, file, logging):
    """
    parameters:
    - wd
    - search_location
    - search_keyword
    - search_remote
    - search_posted
    - search_page
    - search_count
    - file
    - logging
    """

    page_wait = 30
    click_wait = 5
    async_wait = 5
    retry_attempts = 3

    url_search = f'https://www.linkedin.com/jobs/search/?f_TPR={search_posted}&f_WT={search_remote}&geoId=103644278&keywords={search_keyword}&location={search_location}&start={search_page}'

    logging.info("Navigating to jobs page...")
    wd.get(url_search)
    time.sleep(page_wait) # sneaky sneaky sleep
    logging.info("     ...succeeded")

    # original parameter: "//small[@class='display-flex t-12 t-black--light t-normal']"
    # second parameter: "//small[@class='jobs-search-results-list__text display-flex t-12 t-black--light t-normal']"
    # third parameter: "//small[@class='jobs-search-results-list__text display-flex t-12 t-black--light t-normal'] [@aria-live='polite']"
    # fourth parameter: "/html/body/div[5]/div[3]/div[3]/div[2]/div/section[1]/div/header/div[1]/small"

    # search_count = wd.find_element_by_xpath("//small[@class='jobs-search-results-list__text display-flex t-12 t-black--light t-normal']").text
    search_count = wd.find_element_by_css_selector("small.jobs-search-results-list__text").text
    search_count = int(search_count.split(' ')[0].replace(',', ''))
    logging.info(f"Loading page {round(search_page/25) + 1} of {round(search_count/25)} for {search_keyword}'s {search_count} results...")

    # collects job_ids for the current page
    for attempt in range(retry_attempts):
        try:
            search_results = wd.find_element_by_xpath("//ul[@class='jobs-search-results__list list-style-none']").find_elements_by_tag_name("li")
            result_ids = [result.get_attribute('id') for result in search_results if result.get_attribute('id') != '']
            break
        except:
            # EXCEPTION:
            # wait a few attempts, if not throw an exception and then skip to next page
            time.sleep(click_wait)

    # cycle through each id and append the job data to a new list called list_jobs
    list_jobs = []
    for res_id in result_ids:
        try:
            job = wd.find_element_by_id(res_id)
            job_id = job.get_attribute("data-occludable-entity-urn").split(':')[-1]
            wd.find_element_by_xpath(f"//div[@data-job-id={job_id}").click()
        except:
            # EXCEPTION:
            # exception probably caused by the job posting being deleted?
            # either way, probably better to just except it here and skip forward
            continue

        for attempt in range(retry_attempts):
            try:
                job_title = wd.find_element_by_xpath("//h2[@class='t-24 t-bold']")
                job_title = job_title.text
                break
            except:
                # EXCEPTION:
                # having some issues with the xpath thing up there ^
                # making an exception here to just wait for the click delay
                # then move to the next job
                job_title = ''
                time.sleep(click_wait)
        
        # Get Company and Location (and if the job is remote if applicable)
        for attempt in range(retry_attempts):
            try:
                job_top_card = wd.find_element_by_xpath("//span[@class='jobs-unified-top-card_subtitle-primary-grouping mr2 t-black']").find_elements_by_tag_name("span")
                company = job_top_card[0].text
                location = job_top_card[1].text
                if len(job_top_card) > 2:
                    # the format of LinkedIn job cards is like 
                    # Company, Location, (Remote)
                    # so we'll only grab the Remote tag if it's present
                    remote = job_top_card[2].text
                else:
                    # Otherwise, set Remote to an empty string
                    remote = ''
                break
            except:
                company = ''
                location = ''
                remote = ''
                time.sleep(click_wait)

        # Get date posted (or reposted) and number of applicants
        for attempt in range(retry_attempts):
            try:
                job_top_card2 = wd.find_element_by_xpath("//span[@class='jobs-unified-top-card__subtitle-secondary-grouping t-black--light']").find_elements_by_tag_name("span")
                update_time = job_top_card2[0].text
                applicants = job_top_card2[1].text.split(' ')[0]
                break
            except:
                update_time = ''
                applicants = ''
                time.sleep(click_wait)

        job_time = ''
        job_position = ''
        job_pay = ''

        for attempt in range(retry_attempts):
            try:
                # make sure HTML element is loaded
                element = WebDriverWait(wd, 10).until(ec.presence_of_all_elements_located((By.XPATH, "//div[@class='mt5 mb2']/div[1]")))
                # make sure text is loaded
                try:
                    job_info = element.text
                    if job_info != '':
                        # separate job info on time requirements and position
                        job_info = job_info.split(" · ")
                        if len(job_info) == 1:
                            job_pay = ''
                            job_time = job_info[0]
                            job_position = ''
                        elif (len(job_info) >= 2) and ("$" in job_info[0]):
                            job_pay = job_info[0]
                            job_time = job_info[1]
                            if len(job_info) >= 3:
                                job_position = job_info[2]
                            else:
                                job_position = ''
                        else:
                            job_time = job_info[0]
                            job_position = job_info[1]
                            job_pay = ''
                        break
                    else:
                        time.sleep(async_wait)
                except:
                    # error means page didn't load so try again
                    time.sleep(async_wait)
            except:
                # error means page didn't load so try again
                time.sleep(async_wait)

        # get company details and seperate on size and industry
        company_size = '' # assigning as blanks as not important info and can skip if not obtained below
        company_industry = ''
        job_details = ''      
        for attempt in range(retry_attempts):
            try:
                company_details = wd.find_element_by_xpath("//div[@class='mt5 mb2']/div[2]").text
                if " · " in company_details:
                    company_size = company_details.split(" · ")[0]
                    company_industry = company_details.split(" · ")[1]
                else:
                    company_size = company_details
                    company_industry = ''
                job_details = wd.find_element_by_id("job-details").text.replace("\n", " ")
                break
            except: 
                time.sleep(click_wait)

        # append (a) line to file
        date_time = datetime.datetime.now().strftime("%d%b%Y-%H:%M:%S")
        search_keyword = search_keyword.replace("%20", " ")
        list_job = [date_time, search_keyword, search_count, job_id, job_title, company, location, remote, update_time, applicants, job_pay, job_time, job_position, company_size, company_industry, job_details]
        list_jobs.append(list_job)

    with open(file, "a") as f:
        w = csv.writer(f)
        w.writerows(list_jobs)
        list_jobs = []
    
    logging.info(f"Page {round(search_page/25) + 1} of {round(search_count/25)} loaded for {search_keyword}")
    search_page += 25

    return search_page, search_count, url_search

In [2]:
# create logging file
logging = create_logfile()

# create daily csv file
date = datetime.date.today().strftime('%d-%b-%y')
file = f"data/{date}.csv"
create_file(file, logging)

# login to linkedin and assign webdriver to variable
wd = login(logging)

# URL search terms here, try to limit to 3 or so
# 
search_keywords = ['Data Analyst', 'Data Scientist', 'Data Engineer']
search_location = "United%20States"
search_remote = "2" # filter for remote positions
search_posted = "r86400" # filter for past 24 hours

# Counting Exceptions
exception_first = 0
exception_second = 0

for search_keyword in search_keywords:
    search_keyword = search_keyword.lower().replace(" ", "%20")

# Loop through each page and write results to csv
    search_page = 0 # start on page 1
    search_count = 1 # initiate search count until looks on page
    while (search_page < search_count) and (search_page != 1000):
        # Search each page and return location after each completion
        try:
            search_page, search_count, url_search = page_search(wd, search_location, search_keyword, search_remote, search_posted, search_page, search_count, file, logging)
        except Exception as e:
            logging.error(f'(1) FIRST exception for {search_keyword} on {search_page} of {search_count}, retrying...')
            logging.error(f'Current URL: {url_search}')
            logging.error(e)
            logging.exception('Traceback ->')
            exception_first += 1
            time.sleep(5) 
            try:
                search_page, search_count, url_search = page_search(wd, search_location, search_keyword, search_remote, search_posted, search_page, search_count, file, logging)
                logging.warning(f'Solved Exception for {search_keyword} on {search_page} of {search_count}')
            except Exception as e:
                logging.error(f'(2) SECOND exception remains for {search_keyword}. Skipping to next page...')
                logging.error(f'Current URL: {url_search}')
                logging.error(e)
                logging.exception('Traceback ->')
                search_page += 25 # skip to next page to avoid entry
                exception_second += 1
                logging.error(f'Skipping to next page for {search_keyword}, on {search_page} of {search_count}...')

# close browser
wd.quit()

logging.info(f'LinkedIn data scraping complete with {exception_first} first and {exception_second} second exceptions')
logging.info(f'Regard all further alarms...')

  wd = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)
  wd.find_element_by_id('session_key').send_keys(LINKEDIN_USERNAME)
  wd.find_element_by_id('session_password').send_keys(LINKEDIN_PASSWORD)
  wd.find_element_by_xpath("//button[@class='sign-in-form__submit-button']").click()
  wd.find_element_by_xpath("//buton[@class='primary-action-new']").click()
  search_count = wd.find_element_by_css_selector("small.jobs-search-results-list__text").text
  search_results = wd.find_element_by_xpath("//ul[@class='jobs-search-results__list list-style-none']").find_elements_by_tag_name("li")
  job = wd.find_element_by_id(res_id)
  wd.find_element_by_xpath(f"//div[@data-job-id={job_id}").click()
