# Recrawl the job description

## #1 Fix the crawling

In [1]:
import logging
from linkedin_scraper import Job

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class _Job(Job):
    """``linkedin_scraper.Job`` variant whose logged-in scrape only collects the description.

    ``scrape_logged_in`` is overridden so that a crawl loads the job page, reads
    the element whose class contains ``jobs-description`` and stores its text in
    ``self.job_description``.
    """

    def __init__(self, **kwargs):
        """Create the extra text attributes, then hand ``kwargs`` to ``Job.__init__``.

        The attributes are assigned *before* the parent constructor runs, so the
        parent constructor may overwrite any of them.
        """
        self.job_title = self.required_skills = ""
        self.job_type_1 = self.job_type_2 = ""

        super().__init__(**kwargs)

    def scrape_logged_in(self, close_on_complete=True):
        """Open ``self.linkedin_url`` and store the job description text.

        Args:
            close_on_complete: When true, the driver window is closed after the
                description has been extracted.

        NOTE(review): ``wait_for_element_to_load`` is inherited; it presumably
        raises ``TimeoutException`` when the element never appears (the crawl
        loop in this notebook catches that exception) — confirm.
        """
        browser = self.driver
        browser.get(self.linkedin_url)
        self.focus()

        description_html = self.wait_for_element_to_load(
            by=By.XPATH,
            name="//*[contains(@class, 'jobs-description')]",
        ).get_attribute(name="innerHTML")

        wanted_tags = ("span", "ul", "strong", "li")
        # Every matching tag contributes its full text, including tags nested
        # inside another matching tag: the text of a <li> therefore shows up
        # once in its parent <ul> fragment and once on its own.
        fragments = [
            tag.get_text()
            for tag in BeautifulSoup(description_html, 'html.parser').find_all()
            if tag.name in wanted_tags
        ]
        self.job_description = '\n'.join(fragments)

        if not close_on_complete:
            return
        browser.close()

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from linkedin_scraper import actions

import os

def set_chrome_options() -> Options:
    """Build the Chrome options used to start the Selenium driver.

    Returns:
        An ``Options`` instance with the ``--headless``, ``--no-sandbox`` and
        ``--disable-dev-shm-usage`` arguments added and a ``prefs`` experimental
        option attached.
    """
    options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    # NOTE(review): presumably the value 2 asks Chrome not to load images —
    # confirm against the Chrome preference reference.
    options.experimental_options["prefs"] = {
        "profile.default_content_settings": {"images": 2},
    }
    return options
    
def crawl_jd(driver, job_link):
    """Scrape a single job posting and return its description text.

    Args:
        driver: Selenium driver handed to ``_Job``; it is not closed afterwards
            (``close_on_complete=False``), so it can be reused for the next link.
        job_link: URL of the LinkedIn job posting to scrape.

    Returns:
        The ``job_description`` attribute of the scraped ``_Job``.
    """
    return _Job(
        linkedin_url=job_link,
        driver=driver,
        close_on_complete=False,
        scrape=True,
    ).job_description

## Main method

In [3]:
# Start the Chrome session that the crawl cells below reuse.
driver = webdriver.Chrome(options=set_chrome_options())
# Credentials are read with .get() so that an unset variable yields None rather
# than raising KeyError; linkedin_scraper's login prompts in the terminal when
# the email or password isn't given.
actions.login(driver, os.environ.get("EMAIL"), os.environ.get("PWORD"))
print("... Logged in.")

... Logged in.


In [None]:
%%time
from datetime import datetime, timedelta
import pandas as pd

import os
from pathlib import Path

from selenium.common.exceptions import TimeoutException

from utils import standardize_job_urls

# Daily crawl CSVs are read from (and written back to) the "data" directory
# next to the current working directory.
data_dir_path = Path(os.path.abspath(".."), "data")

# Inclusive range of daily CSV files to re-crawl.
start_date = datetime(2023, 10, 28)
end_date = datetime(2023, 11, 4)

date = start_date
while date <= end_date:
    csv_path = Path(data_dir_path, f"crawled_jobs_{date.strftime('%Y-%m-%d')}.csv")
    print(csv_path)

    new_daily_data = pd.read_csv(csv_path)
    new_daily_data = new_daily_data.pipe(standardize_job_urls)
    # The flag column makes the crawl resumable: rows already marked as
    # re-crawled are skipped when this cell is run again on the same file.
    if "jd_recrawled" not in new_daily_data.columns:
        new_daily_data.loc[:, "jd_recrawled"] = False

    try:
        # Rows are crawled one at a time (rather than in one vectorised map)
        # so that progress can be checkpointed to disk along the way.
        for i, row in new_daily_data.iterrows():
            if row.jd_recrawled:
                continue

            try:
                jd = crawl_jd(driver, row.linkedin_url)
            except TimeoutException:
                # The row stays unflagged so a later run retries it; the miss
                # is logged instead of being dropped silently.
                logger.warning("Timed out while crawling %s", row.linkedin_url)
            else:
                new_daily_data.loc[i, "job_description"] = jd
                new_daily_data.loc[i, "jd_recrawled"] = True

            # Checkpoint every tenth row. `i` is the DataFrame index label, so
            # this assumes the default 0..n-1 index — TODO confirm that
            # standardize_job_urls does not drop or reorder rows.
            if (i + 1) % 10 == 0:
                print(f"...{i+1} of {len(new_daily_data)} jobs done.")
                new_daily_data.to_csv(csv_path, index=False)
    finally:
        # Always write the file back, including when the loop is interrupted
        # or an unexpected error escapes, so rows crawled since the last
        # checkpoint are not lost.
        new_daily_data.to_csv(csv_path, index=False)
    print("... CSV re-crawling done.")
    print()

    date += timedelta(days=1)

/home/parallels/repos/ds-employment-landscape/data/crawled_jobs_2023-10-28.csv
...130 of 383 jobs done.
...140 of 383 jobs done.
...150 of 383 jobs done.
...160 of 383 jobs done.
...170 of 383 jobs done.
...180 of 383 jobs done.
...190 of 383 jobs done.
...200 of 383 jobs done.
...210 of 383 jobs done.
...220 of 383 jobs done.
...230 of 383 jobs done.
...240 of 383 jobs done.
...250 of 383 jobs done.
...260 of 383 jobs done.
...270 of 383 jobs done.
...280 of 383 jobs done.
...290 of 383 jobs done.
...300 of 383 jobs done.
...310 of 383 jobs done.
...320 of 383 jobs done.
...330 of 383 jobs done.
...340 of 383 jobs done.
...350 of 383 jobs done.
...360 of 383 jobs done.
...370 of 383 jobs done.
...380 of 383 jobs done.
... CSV re-crawling done.

/home/parallels/repos/ds-employment-landscape/data/crawled_jobs_2023-10-29.csv
...100 of 373 jobs done.
...110 of 373 jobs done.
...120 of 373 jobs done.
...130 of 373 jobs done.
...140 of 373 jobs done.
...150 of 373 jobs done.
...160 of 373 j