In [15]:
import json
import requests
from bs4 import BeautifulSoup
from datetime import date
import time
import random
import pickle
import pandas as pd
import numpy as np
import re
import os

In [16]:
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

In [17]:
wait_time = 3
def loadDataframe():
    dataFrameEmpty = {
        'id': [],
        'title': [],
        'company': [],
        'location': [],
        "salary": [],
        'link': [],
        "retrieved_on": [],
        "post_data": [],
        "description": []
        
    }
    wait_time = 1
    df = pd.DataFrame(dataFrameEmpty)
    if os.path.exists('job_search_database.csv'):
        df = pd.read_csv('job_search_database.csv')
    return df


In [18]:
job_id_cache = {}
if os.path.exists('job_id_cache.pickle'):
    with open('job_id_cache.pickle', 'rb') as f:
        # Pickle the dictionary
        job_id_cache = pickle.load(f)

In [19]:
blackList = []
if os.path.exists("word_blacklist.json"):
    with open("word_blacklist.json", "r") as f:
        data = json.load(f)
        blackList = data["blacklist"]

def checkBlackList(text):
    for elm in blackList:
        if re.search(elm, text.lower()) is not None:
            return False
    return True

In [20]:
def saveDictionary(name, dict):
    with open(name, 'wb') as f:
        pickle.dump(dict, f)

In [21]:
def get_element_text_by_classname(driver, class_name):
    found = driver.find_elements(By.CSS_SELECTOR, "." + class_name)
    if len(found) <= 0:
        return ""
    return found[0].text

In [22]:
def login(driver):
    driver.get("https://www.linkedin.com/login")
    time.sleep(wait_time)
    with open('credentials.json') as f:
        data = json.load(f)
        USERNAME = data['username']
        PASSWORD = data['password']
    
        username = driver.find_element(By.ID, "username")
        username.send_keys(USERNAME)
        time.sleep(wait_time)
        
        pword = driver.find_element(By.ID, "password")
        pword.send_keys(PASSWORD) 
    time.sleep(wait_time)
    
    driver.find_element(By.XPATH, "//button[@type='submit']").click()
    time.sleep(wait_time)

In [23]:
def getJobData(driver):
    job_posts = []
    resultList = driver.find_element(By.CSS_SELECTOR, ".scaffold-layout__list-container")
    jobs = resultList.find_elements(By.CSS_SELECTOR, ".jobs-search-results__list-item")
    for job in jobs:
        job_id = job.get_attribute('data-occludable-job-id')
        
        if (checkBlackList(job.text) and job_id not in job_id_cache):
            job_id_cache[job_id] = True
            jobTitleContainerList = job.find_elements(By.CSS_SELECTOR, ".job-card-list__title")
            jobMetaData = job.find_elements(By.CSS_SELECTOR, ".job-card-container__metadata-item")
            posting_link = ""
            job_title = ""
            salary = ""
            location = ""
            today = date.today()
            if len(jobMetaData) > 1:
                location = jobMetaData[0].text
            if len(jobMetaData) > 2:
                salary = jobMetaData[1].text
            if len(jobTitleContainerList) > 0:
                posting_link = jobTitleContainerList[0].get_attribute('href')
                job_title = jobTitleContainerList[0].text
            company_name = get_element_text_by_classname(job, "job-card-container__primary-description")
            if job_title != "" and checkBlackList(job_title):
                job_posts.append([job_id, job_title, company_name, location, salary, posting_link, today.strftime('%B %d, %Y')])
    return job_posts

In [24]:
def getJobDescriptions(driver, listOfJobs):
    newJobList = []
    for idx in range(len(listOfJobs)):
        job = listOfJobs[idx]
        print(job)
        driver.get(job[5])
        tvm_text = get_element_text_by_classname(driver, "tvm__text tvm__text--neutral")
        # click expand button
        driver.find_element(By.CSS_SELECTOR, ".jobs-description__footer-button").click()
        description = get_element_text_by_classname(driver, "jobs-description-content__text")
        if (checkBlackList(description[17:])):
            newJobList.append(listOfJobs[idx] + [tvm_text] + [description[17:]])
        
        time.sleep(random.randint(0, 30))
        
    return newJobList


In [25]:
def openBrowserAndSearch():
    options = Options()
    options.add_argument('--ignore-certificate-errors')
    driver = uc.Chrome(options)

    login(driver)
    time.sleep(wait_time)
    # Go to job search page
    URL = "https://www.linkedin.com/jobs/search/?currentJobId=3822054670&f_E=2&f_TPR=r2592000&geoId=100017349&keywords=software%20engineer&location=80247%2C%20Denver%2C%20Colorado%2C%20United%20States&origin=JOB_SEARCH_PAGE_SEARCH_BUTTON&refresh=true"
    driver.get(URL)
    time.sleep(wait_time)
    return driver

def main():
    df = loadDataframe()
    driver = openBrowserAndSearch()
    jobsOnPage = getJobData(driver)
    jobsOnPageWithDescription = getJobDescriptions(driver, jobsOnPage)
    for job in jobsOnPageWithDescription:
        df.loc[len(df.index)] = job
        
    df.to_csv("job_search_database.csv", sep=',', index=False, encoding='utf-8')
    saveDictionary("job_id_cache.pickle", job_id_cache)

In [26]:
main()

['3841120639', 'Software Development Engineer in Test', 'Charles Schwab', 'Lone Tree, CO (On-site)', '', 'https://www.linkedin.com/jobs/view/3841120639/?eBP=CwEAAAGOLz3grOZbT_dF7Wtq0YsYFdJrEiKyDCW46DqwOnsXsBOc9tstsl7bHOCa5TTchUovPx1UEiMv4UNNv8FG6rlKnpA9H6Usf7_pn8GlJOuOshrXqPOSfq9I6K7twzDs4_isXtqnjnTV15IyteNT5FIvix0AgtSGLUrOcG8kxvDxXtWxIXeMFAvtJkGuYs-ar5ZQIwmB-tPRXpAr41uvq-2KyfEOCSWwNmbaI3RZDwTuwKzQ6n2h8jLMNHVVRf5cJfutwU1dZWvVbqjVNZeLMzl3S2dbFjVQ6wDs4BZvMF6pA7oW7QyiqSOiNJRN8GX-Agunh1l0udWL9G38_kKxYJ-Sv0WqclSMo3BSaJIe2lJZFgd5ZNzKmK1Q8-4uE8_HS1AJRPH5SxnPTBm6Zk05kC-1LIo&refId=tsVJw8YYInwnA9Zf%2BIIpug%3D%3D&trackingId=WmRD6yzLfaUVeKZ43S%2BzGw%3D%3D&trk=flagship3_search_srp_jobs', 'March 11, 2024']
['3822054670', 'Java Developer', 'Tata Consultancy Services', '', '', 'https://www.linkedin.com/jobs/view/3822054670/?eBP=CwEAAAGOLz3grBQGxzNHj8rRbtAIJPEzxvI09M3mrg3osx8ocQoun-dfGl2w0pTjSdvLbH3VxP92MOJTGuUkCy8_kWGoeP5VZ4aIKf8S02XroVl1GXvRpRF2RIrvWZeqCrHEl3SNBqWWn1wbKBqOZWBB1G9CJhB1bDlYg17j0eQkUqMh