#### This module loads pickled url files and unpacks them into one list item then visits the links and scrapes the data

In [3]:
import os
import random

from bs4 import BeautifulSoup
import json

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as conditions
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import re

import pickle as pk



In [8]:
def fetch_links():
    files = os.listdir("../Outputs/links/")
    urls = []

    # for file in files:
    #     print(f"Fetched {file}")
    #     _ = open(("../Outputs/links/" + file), "rb")
    #     urls.append(pk.load(_))
    
    # Isolated testing
    file = open('../Outputs/links/codingkenya.pk','rb') 
    urls.append(pk.load(file))

    urls = [url for sublist in urls for url in sublist]

    print(f"{len(urls)} urls have been unpacked from {len(files)} found files")
    return urls


## Cleaning Function

- This function takes in 'dirty' text and returns cleaned text devoid of formatting and markdown syntax. 
- This is not the final cleaning. Further preprocessing will be done when prepping the data for use by the NLP model. This just makes the data human-readable. 
- The steps taken are:
    1. The data is parsed using a BS4 html parser to remove HTML tags.
    2. Newlines are then replaced with spaces
    3. The '/xa0' character for newline is also removed
    4. '/u2019' character for apostrphe is replaced with an actual apostrophe
    5. '/u2013' unicode for dash is replaced with - if found

In [5]:
def parser(dirty):
        # removing HTML tags
        raw = BeautifulSoup(dirty, "html.parser").get_text()

        text = (
            raw.replace("\n", " ")
            .replace("\xa0", " ")
            .replace("\u2019", "'")
            .replace("\u2013", "-")
            .replace("\u2022", "")
            .replace("\u201c", "")
            .replace("\u201d", '"')
            .replace("\u2014", "—")
        )

        # removing tab spaces and large spaces
        cleaned = " ".join(text.split())

        return cleaned



In [11]:
def scraper(driver,urls):
        jobs = {}
        skipped = 0

        # using regex to determine the site that is about to be scraped and to adjust the xpath to be used accordingly
        brightermonday = re.compile(r"brightermonday", re.IGNORECASE)
        codingkenya = re.compile(r"codingkenya", re.IGNORECASE)
        myjobmag = re.compile(r"myjobmag", re.IGNORECASE)

        for url in urls:
            print(f"Working on {url}")
            try:
                if brightermonday.search(url):
                    print("Brighter MOnday Link Identified")
                    xpath = {
                        "company": '//h2[@class="pb-1 text-sm font-normal"]',
                        "title": '//*[@id="tab1"]/div/article/div[2]/div[2]/h1',
                        "description": "//*[@id='tab1']/div/article/div[5]/div",
                        "location": '//*[@id="tab1"]/div/article/div[2]/div[2]/div[1]/*[1]',
                        "nature": '//*[@id="tab1"]/div/article/div[2]/div[2]/div[1]/*[2]',
                        "salary": '//*[@id="tab1"]/div/article/div[2]/div[2]/div[2]/span[1]/span',
                        "posted": '//*[@id="tab1"]/div/article/div[3]/div[2]',
                    }
                elif codingkenya.search(url):
                    print("Coding Kenya Link Identified")
                    xpath = {
                        "title":"/html/body/div/div/header[2]/div/div/div[1]/h1",
                        "description":"/html/body/div/div/div/div/div/div[2]/main/div/div/div[1]/div[2]",
                        "location": "/html/body/div/div/header[2]/div/div/div[1]/div/ul/li[2]/a",
                        "nature": "/html/body/div/div/header[2]/div/div/div[1]/div/ul/li[1]",
                    }
                elif myjobmag.search(url):
                    print("MyJobMag Link Identified")
                    xpath = {
                        "title": "/html/body/section/div/div/div[1]/ul/li[3]/h2[1]/span",
                        "description": "//*[@id='printable']/div[2]",
                        "location": '//*[@id="printable"]/ul/li[4]/span[2]/a',
                        "nature": '//*[@id="printable"]/ul/li[1]/span[2]/a',
                        "salary": '//*[@id="tab1"]/div/article/div[2]/div[2]/div[2]/span[1]/span',
                        "posted": '//*[@id="posted-date"]',
                    }

                driver.get(url)
                try:
                    WebDriverWait(driver, 20).until(
                        conditions.presence_of_element_located(
                            (By.XPATH, xpath["description"])
                        ),
                    )

                    #  get job details and clean innerHTML from posted and description
                    title = driver.find_element(By.XPATH, xpath["title"])
                    location =driver.find_element(By.XPATH, xpath["location"])
                    nature = driver.find_element(By.XPATH, xpath["nature"])
                    # salary = self.driver.find_element_by_xpath(xpath["salary"])
                    description = parser(
                        dirty=driver.find_element(
                            By.XPATH, xpath["description"]
                        ).get_attribute("innerHTML")
                    )

                    print(title)

                    # updating the jobs dict with key value pairs of {url:{job details}}

                    jobs.update(
                        {
                            url: {
                                "title": title.text,
                                "location": location.text,
                                "nature": nature.text,
                                # "salary": (
                                #     salary.text
                                #     if len(salary.text) > 6
                                #     else "Unspecified"
                                # ),
                                "description": description,
                            }
                        }
                    )

                except TimeoutException:
                    skipped += 1

            except NoSuchElementException:
                print("Hii stuff haiko buda")
                return jobs, skipped
            except TimeoutException:
                return jobs, skipped

        return jobs, skipped

In [12]:
def main():
    options = Options()
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/109.0.0.0 Safari/537.36"
    )
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=options)

    # call the functions
    urls = fetch_links()
    jobs,skipped = scraper(driver, urls)
    if skipped > 0:
        print(f"Completed {len(jobs)} jobs, {skipped} skipped due to timeout)")
        

if __name__ == "__main__":
    main()

1000 urls have been unpacked from 3 found files
Working on https://codingkenya.com/job/full-stack-developer-7/
Coding Kenya Link Identified
About the jobWe are Quadcode, a company that develops a SaaS trading platform for clients around the world. We are currently looking for Full Stack Developer for an exciting new fintech venture.Sabio Trade is a trading firm dedicated to empowering talented traders to reach their full potential and attain financial success. Our commitment to fostering a dynamic trading environment is reflected in our company culture and values. Tasks in the roleSupporting the existing product functionality and developing new features in PHP, JS and Golang.RequirementsThe Full Stack Developer role has the following requirements:Experience in PHP development for at least 2 years (proficiency in one of the frameworks: Symfony/Laravel/Yii);Experience in Golang development is a plus;Proficiency in a popular JS framework such as React, Vue, or Angular;Knowledge of HTML/CS

KeyboardInterrupt: 