#### This module loads the pickled URL files, unpacks them into a single flat list, then visits each link and scrapes the data

In [118]:
import os
import pickle as pk

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as conditions
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm



In [119]:
def fetch_links():
    """Load every pickled URL collection under ``links/`` into one flat list.

    Each regular file in the ``links/`` directory (relative to the current
    working directory) is unpickled and its items are appended, in order, to
    the result. Files are visited in sorted name order so that the returned
    list is the same on every run (``os.listdir`` alone gives no ordering
    guarantee).

    Returns:
        list: The items of every unpickled collection, concatenated.

    Raises:
        FileNotFoundError: If the ``links/`` directory does not exist.
    """
    urls = []
    for name in sorted(os.listdir("links/")):
        path = os.path.join("links", name)
        # Skip sub-directories (e.g. editor/checkpoint folders); open() would fail on them.
        if not os.path.isfile(path):
            continue
        # NOTE(security): pickle.load can execute arbitrary code -- only load
        # files that were produced by this project.
        with open(path, "rb") as handle:  # context manager closes the file handle
            urls.extend(pk.load(handle))

    return urls


## Cleaning Function

- This function takes in 'dirty' HTML text and returns cleaned text devoid of HTML markup and formatting.
- This is not the final cleaning. Further preprocessing will be done when prepping the data for use by the NLP model. This just makes the data human-readable. 
- The steps taken are:
    1. The data is parsed using a BS4 html parser to remove HTML tags.
    2. Newlines are then replaced with spaces
    3. The `\xa0` non-breaking space character is replaced with a regular space
    4. The `\u2019` right single quotation mark (used as an apostrophe) is replaced with a plain ASCII apostrophe
    5. The `\u2013` en dash is replaced with a plain hyphen (-) if found

In [120]:
def parser(dirty):
    """Turn an HTML fragment into plain, single-spaced, human-readable text.

    Steps:
        1. Strip HTML tags with BeautifulSoup's ``html.parser``.
        2. Map newlines and non-breaking spaces to spaces, typographic
           apostrophes/quotes/en dashes to their ASCII equivalents, and drop
           bullet-like glyphs and the ellipsis character.
        3. Collapse every run of whitespace (tabs, repeated spaces) to one space.

    Args:
        dirty: HTML markup (e.g. an element's ``innerHTML``).

    Returns:
        str: The cleaned text.
    """
    # removing HTML tags
    raw = BeautifulSoup(dirty, "html.parser").get_text()

    # Single-pass character table. No replacement value is itself a key, so
    # this is equivalent to applying the replacements one after another.
    table = str.maketrans(
        {
            "\n": " ",
            "\xa0": " ",  # non-breaking space
            "\u2019": "'",  # right single quotation mark used as apostrophe
            "\u2013": "-",  # en dash
            "\u2022": "",  # bullet
            # Left double quote now maps to '"' like its right counterpart;
            # previously it was deleted, leaving unbalanced quotes.
            "\u201c": '"',
            "\u201d": '"',
            # "\u2014" (em dash) is intentionally left untouched: the former
            # replace mapped it to itself and was a no-op.
            "\uf0fc": "",  # private-use glyph
            "\u00b7": "",  # middle dot
            "\u2026": "",  # horizontal ellipsis
            # NOTE(review): deleting accented letters drops characters from
            # words (e.g. "resume" written with accents); kept as-is because
            # it may be deliberate -- confirm against downstream preprocessing.
            "\u00e8": "",
            "\u00e9": "",
        }
    )
    text = raw.translate(table)

    # removing tab spaces and large spaces
    cleaned = " ".join(text.split())

    return cleaned



In [121]:
def scraper(driver, urls, start=10, stop=16, timeout=10):
    """Visit a window of job-listing URLs and scrape the details of each one.

    For every URL in ``urls[start:stop]`` the page is loaded, the function
    waits until the description element is present, and the listing fields
    are read via XPath. The ``posted`` and ``description`` fields are taken
    from ``innerHTML`` and cleaned with ``parser``.

    Args:
        driver: A Selenium WebDriver instance used to load the pages.
        urls: Sequence of listing URLs.
        start: First index of ``urls`` to scrape. Defaults to 10, the
            previously hard-coded sample window.
        stop: Index at which to stop (exclusive); ``None`` scrapes to the end.
            Defaults to 16, the previously hard-coded sample window.
        timeout: Seconds to wait for the description element. Defaults to 10.

    Returns:
        tuple: ``(jobs, skipped)`` where ``jobs`` maps each scraped URL to a
        dict with the keys ``company``, ``title``, ``location``, ``nature``,
        ``salary``, ``description`` and ``posted``, and ``skipped`` counts the
        URLs that timed out.
    """
    # The XPaths do not depend on the URL, so build them once instead of per iteration.
    xpath = {
        "company": '//h2[@class="pb-1 text-sm font-normal"]',
        "title": '//*[@id="tab1"]/div/article/div[2]/div[2]/h1',
        "description": "//*[@id='tab1']/div/article/div[5]/div",
        "location": '//*[@id="tab1"]/div/article/div[2]/div[2]/div[1]/*[1]',
        "nature": '//*[@id="tab1"]/div/article/div[2]/div[2]/div[1]/*[2]',
        "salary": '//*[@id="tab1"]/div/article/div[2]/div[2]/div[2]/span[1]/span',
        "posted": '//*[@id="tab1"]/div/article/div[3]/div[2]',
    }

    def element(key):
        # find_element(By.XPATH, ...) replaces the find_element_by_xpath
        # helper, which newer Selenium releases no longer provide.
        return driver.find_element(By.XPATH, xpath[key])

    jobs = {}
    skipped = 0

    for url in tqdm(urls[start:stop], desc="Fetching"):
        try:
            driver.get(url)
            WebDriverWait(driver, timeout).until(
                conditions.presence_of_element_located(
                    (By.XPATH, xpath["description"])
                ),
            )

            #  get job details and clean innerHTML from posted and description
            company = element("company")
            title = element("title")
            location = element("location")
            nature = element("nature")
            salary = element("salary")

            # cleaning the output
            posted = parser(element("posted").get_attribute("innerHTML"))
            description = parser(element("description").get_attribute("innerHTML"))

            salary_text = salary.text

            # storing the job details under the listing's url
            jobs[url] = {
                "company": company.text,
                "title": title.text,
                "location": location.text,
                "nature": nature.text,
                # very short salary strings are treated as "no salary given"
                "salary": (salary_text if len(salary_text) > 6 else "Unspecified"),
                "description": description,
                "posted": posted,
            }

        except (TimeoutException, TimeoutError):
            # WebDriverWait.until raises Selenium's TimeoutException, which is
            # NOT a subclass of the builtin TimeoutError; the builtin is still
            # caught for backward compatibility.
            skipped += 1

        except NoSuchElementException:
            # Informal message, roughly "this stuff isn't there"; the listing
            # is neither stored nor counted as skipped.
            print("Hii stuff haiko buda")

    return jobs, skipped

In [122]:
def main():
    """Start a headless Chrome session, scrape the stored links and report skips.

    Builds Chrome options (custom user agent, headless mode, automation flag
    disabled), loads the URLs with ``fetch_links`` and scrapes them with
    ``scraper``. The browser is always shut down, even if loading or scraping
    raises. When any URL timed out, a summary line is printed.

    Returns:
        dict: The scraped jobs keyed by URL, as returned by ``scraper``
        (previously the result was discarded).
    """
    options = Options()
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/109.0.0.0 Safari/537.36"
    )
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=options)

    try:
        # call the functions
        urls = fetch_links()
        jobs, skipped = scraper(driver, urls)
    finally:
        # Without quit() the headless Chrome process keeps running after the cell finishes.
        driver.quit()

    if skipped > 0:
        # stray ")" removed from the end of the message
        print(f"Completed {len(jobs)} jobs, {skipped} skipped due to timeout")

    return jobs


if __name__ == "__main__":
    main()

Fetching: 100%|██████████| 6/6 [00:18<00:00,  3.13s/it]

{'https://www.brightermonday.co.ke/listings/javamicroservice-developer-javakotlinmicronaut-remote-first-vwmmg5': {'company': 'CaperWhite GmbH', 'title': 'Java/Microservice Developer (Java/Kotlin/Micronaut, Remote First)', 'location': 'Remote (Work From Home)', 'nature': '', 'salary': 'Unspecified', 'description': " No worries if you think you’re not a perfect fit — if your passion is software development and you're willing to learn, we’re willing to invest in you.  What You’ll Do  Find elegant solutions to hard technical problems, architecture design and driving our solution forwardDevelop JVM based microservices according to specifications and design conceptsWrite readable and well-documented codeInnovate using the latest JVM technologies, frameworks, and approachesManage your work by setting your own goals, prioritizing and executing them, and ultimately taking ownership of tasks and projects  What You Have  Demonstrated project experience and in-depth knowledge of Java, Kotlin or Gr


